rm(list = ls())
# Load required libraries
library("EBMAforecast")
library("dplyr")
library("spduration")
library("magrittr")
library("xtable")
library("countrycode")
library("lubridate")

####################################################################################
load("data/paper_imputed.RData")
paper <- paper_imputed
# Theme models ------------------------------------------------------
#2010-04-01
# Dates:
# train = data used to estimate models
# calib + test = test data for theme model estimates
# calib = calibration period for ensemble
# test  = test period for ensemble
train_start <- as.Date("2010-04-01")
calib_start <- as.Date("2017-05-01")
test_start  <- as.Date("2019-01-01")
data_end    <- as.Date(max(paper$date))

# Add duration variables first, then drop missing
train <- paper %>% 
    filter(date < calib_start) %>%
    add_duration(., "battles_dum", "gid", "date", ongoing=FALSE) %>%
    filter(date >= train_start)

calib <- paper %>% filter(date < test_start) %>%
  add_duration(., "battles_dum", "gid", "date", ongoing=FALSE) %>%
  filter(date >= calib_start)

test <- paper %>% filter(date <= data_end) %>%
  add_duration(., "battles_dum", "gid", "date", ongoing=FALSE) %>%
  filter(date >= test_start)

#####################################################################################
#   Start model estimation
####################################################################################
# spatial w
model1 <- spduration::spdur(
          duration ~  battles_sw + battles_s1w + battles_s2w,
          atrisk ~ battles_sw + battles_s1w + battles_s2w,
          data = train, silent = TRUE)

# temporal
model2 <- spduration::spdur(
          duration ~ 1 +  battles_dum_lag1 + battles_dum_lag2 + battles_dum_lag3,
          atrisk ~1 +  battles_dum_lag1 + battles_dum_lag2 + battles_dum_lag3,
          data = train, silent = TRUE)

# environment
model3 <- spduration::spdur(
          duration ~ 1 + forest_gc+ urban_gc + rainseas + cmr_mean + water_gc + agri_gc + bdist1 +  capdist,
          atrisk ~ 1 + forest_gc+ urban_gc + rainseas + cmr_mean + water_gc + agri_gc + bdist1 +  capdist,
          data = train, silent = TRUE)
#spatial + temporal
model4 <- spduration::spdur(
          duration ~ 1 +  battles_sw + battles_s1w + battles_s2w + battles_dum_lag1 + battles_dum_lag2 + battles_dum_lag3,
          atrisk ~ 1 + battles_sw + battles_s1w + battles_s2w + battles_dum_lag1 + battles_dum_lag2 + battles_dum_lag3,
          data = train, silent = TRUE)
#full
model5 <- spduration::spdur(
          duration ~ 1 +  battles_sw + battles_s1w + battles_s2w + battles_dum_lag1 + 
                      battles_dum_lag2 + battles_dum_lag3 +  gdppc + log_pop_density,
          atrisk ~ 1 + battles_sw + battles_s1w + battles_s2w + battles_dum_lag1 + 
                     battles_dum_lag2 + battles_dum_lag3 +  gdppc + log_pop_density,
          data = train, silent = TRUE)

# Set number of models and their names. We will need this several times
# for the code below.
n_models <- 5
model_names <- c("空间扩散模型", "时间依赖模型", "地理环境模型",
                 "时空基础模型", "时空全模型")

save(model_names, model1, model2, model3, model4, model5,file="data/models.rda")

####################################################################################
# Ensemble 
####################################################################################
# Calculate theme model predictions
# Initilize matrices with columns for theme predictions and observed
pr_calib <- matrix(numeric(nrow(calib)*(n_models + 1)), ncol=(n_models + 1)) # 4284*8
pr_calib[, 1] <- calib$failure #observed
colnames(pr_calib) <- c("y", paste0("i", 1:n_models))
pr_test  <- matrix(numeric(nrow(test)*(n_models + 1)), ncol=(n_models + 1))
pr_test[, 1] <- test$failure
colnames(pr_test) <- c("y", paste0("i", 1:n_models))

# Loop through and fill in
for (i in 1:n_models) {
  model_i <- get(paste0("model", i))
  pr_calib[, i+1] <- predict(model_i, newdata=calib, type="conditional hazard")
  pr_test[, i+1]  <- predict(model_i, newdata=test, type="conditional hazard")
}

# fix exact 0's to slightly above 0, otherwise EBMA will not work
mod_idx <- grep("i[0-9]", colnames(pr_calib))
pr_calib[, mod_idx] <- replace(pr_calib[, mod_idx], pr_calib[, mod_idx] <= 0, 1e-19)
pr_test[, mod_idx]  <- replace(pr_test[, mod_idx], pr_test[, mod_idx] <= 0, 1e-19)
pr_calib[, mod_idx] <- replace(pr_calib[, mod_idx], pr_calib[, mod_idx] ==1, 0.9999999999999)
pr_test[, mod_idx]  <- replace(pr_test[, mod_idx], pr_test[, mod_idx] ==1, 0.9999999999999)
# Create ensemble data object
ens_df <- makeForecastData(
        .predCalibration     = pr_calib[, mod_idx],
        .outcomeCalibration = pr_calib[, 1],
        .predTest   		    = pr_test[, mod_idx],
        .outcomeTest 	    	= pr_test[, 1],
        .modelNames = model_names
)

# Calibrate ensemble model
ensemble <- EBMAforecast:::calibrateEnsemble(ens_df, model="logit", maxIter=25000, exp=3,
                                             const=0.000001)
save(ensemble, file = "results/ensemble.rda")

# Add ID info, theme preds, and ensemble pred
pr_calib <- cbind(calib[, c("gwno", "gid", "yearmonth",  "date")], pr_calib, ebma=ensemble@predCalibration[, 1, 1])
pr_test <- cbind(test[, c("gwno", "gid", "yearmonth",  "date")], pr_test, ebma=ensemble@predTest[, 1, 1])
save(pr_calib, file="results/pr_calib.rda")
save(pr_test, file="results/pr_test.rda")


####################################################################################
# Test forecasts ----------------------------------------------------------
#####################################################################################

#   Rolling 6-month forecasts over test period
source("code/fcast-funcs.R")
source("code/predict-ebma.R")

n_ahead <- 6
####################################################################################
#   Start of rolling 6-month forecasts
####################################################################################
pr_test6 <- data.frame(NULL)

# Vector of dates from which to do forecasts;
# dates describe the month of data used in generating the forecast
fcast_data_dates <- seq.Date(min(test$date), max(test$date), by = "month")
for (i in seq_along(fcast_data_dates)) {
  cat(paste0("Forecast from ", fcast_data_dates[i], "\n"))
  # Run fcast_ebma
  this_fcast <- fcast_ebma(fcast_data_dates[i], test, n_ahead)
  # Fill in observed values
  fcast_window_start <- fcast_data_dates[i] %m+% months(1)
  fcast_window_end   <- fcast_window_start  %m+% months(n_ahead - 1)
  obs_y <- test %>%
          dplyr::select(date, gid, failure) %>%
          filter(date >= fcast_window_start & date <= fcast_window_end) %>%
          group_by(gid) %>%
          dplyr::summarize(y = max(failure))
  # For windows that include unobserved future dates, change 0 to NA
  # we can keep 1 because we know already that at least 1 ILC has occurred
  if (fcast_window_end > max(test$date)) {
                    obs_y %<>% mutate(y = ifelse(y==0, NA, 1))
  }
  
  # Fill in observed y 
  this_fcast$y <- NULL
  this_fcast <- left_join(this_fcast, obs_y, by = "gid")
  
  # record all in giant matrix
  pr_test6 <- rbind(pr_test6, this_fcast)
}

save(pr_test6, file="results/pr_test6.rda")

####################################################################################
# Live forecasts 
####################################################################################

pr_fcast  <- fcast_ebma(max(paper$date), paper, 6, collapse=FALSE)
# Collapse 6-month forecast and get top 10 table
pr_fcast_agg <- pr_fcast %>%
  group_by(gid) %>%
  dplyr::summarize(
            date = min(date),
            y = max(y),
            i1 = p_agg(i1),
            i2 = p_agg(i2),
            i3 = p_agg(i3),
            i4 = p_agg(i4),
            ebma = p_agg(ebma)
  ) %>%
  arrange(desc(ebma)) 
save(pr_fcast, file="results/pr_fcast.rda")
save(pr_fcast_agg, file="results/pr_fcast_agg.rda")
####################################################################################




